*Set up
clear all
set memory 5g

set more off

*N.B.: Unweighted localization file;

use "\\ead02\ead_uquam\Localization\NAICS6_panel\cdf_rhs_rev2.dta", clear

cd "\\ead02\ead_uquam\Localization\restat_results"

*************************************************************************************VARIABLE CONSTRUCTION************************************************************************************************

*TSET FUNCTION;

destring naics, replace
destring oecd80, replace

xtset naics year, delta(1)


//LHS: INCREMENTAL CDF

*log transformed

generate ln_cdf10 = ln(cdf10)
generate ln_cdf25 = ln(cdf25)
generate ln_cdf50 = ln(cdf50)
generate ln_cdf100 = ln(cdf100)
generate ln_cdf200 = ln(cdf200)
generate ln_cdf500 = ln(cdf500)
generate ln_cdf800 = ln(cdf800)

gen ln_cdf10_25=ln(cdf25-cdf10)
gen ln_cdf10_50=ln(cdf50-cdf10)
gen ln_cdf25_50=ln(cdf50-cdf25)
gen ln_cdf50_100=ln(cdf100-cdf50)
gen ln_cdf100_200=ln(cdf200-cdf100)
gen ln_cdf100_500=ln(cdf500-cdf100)
gen ln_cdf200_500=ln(cdf500-cdf200)
gen ln_cdf500_800=ln(cdf800-cdf500)
gen cdf500_800=cdf800-cdf500
gen ln_cdf10_200=ln(cdf200-cdf10)

*untransformed for standardized variable

gen cdf10_25=cdf25-cdf10
gen cdf25_50=cdf50-cdf25
gen cdf50_100=cdf100-cdf50
gen cdf100_200=cdf200-cdf100
gen cdf200_500=cdf500-cdf200


//RHS: ADDITIONAL VARIABLE GENERATION;

gen totalemp = salemp + prdwrk
gen rdshr=rd*1000/vsm

generate ln_av = ln(av_klems)
generate ln_ifql3 = ln(ifql3)
generate ln_ifql4 = ln(ifql4)

gen lnifql34=ln(ifql3+ifql4)

generate lnefown = ln(efown)
generate lnnfown = ln(nfown)
generate lnemulti = ln(emulti1)
generate lnnmulti = ln(nmulti1)
generate lnm_emp = ln(m_emp)
generate lnvsm = ln(vsm)
generate lnvam = ln(vam)
generate lnmat_sales = ln(mat_sales)
generate lnmat_va = ln(mat_va)
generate lnnprdprd = ln(nprdprd)
generate lnnprdprdi = ln(nprdprdi)
generate lnnprdprdis = ln(nprdprdis)

gen lnnprdprd2 = lnnprdprd*lnnprdprd
gen lnnprdprdi2 = lnnprdprdi*lnnprdprdi

generate lnman = ln(man)
generate lnbss = ln(bss)
generate lnnrs = ln(nrs)
generate lnutl = ln(utl)
generate lnherf = ln(herf)
generate lnherfent = ln(herfent)
generate lnemp = ln(salemp + prdwrk)
gen ln_empl = ln(salemp+prdwrk)
gen empl = salemp+prdwrk
gen lnemplshr=ln(salemp/(salemp+prdwrk))
gen lnplantsn=ln( plantsn)
generate lnavsq = lnav_klems*lnav_klems
generate lnmlwxlnifql2 = lnm_hw*lnifql2
generate lnifql4xlnifqk2= lnifql4* lnifqk2
generate lnifql4xlnifqk3= lnifql4* lnifqk3

generate lnhc = ln(ifqh4/ifqh)
generate lnhcxlnifqk2 = lnhc*lnifqk2

//generate proxy for labor market pooling: 

gen test = ln(salemp/prdwrk)
gen test1 = ln(lnifqh4/(lnifqh2+lnifqh3+lnifqh4))
gen test2 = ln(lnifqh3/(lnifqh2+lnifqh3+lnifqh4))

*hours worked

gen ifqh2shr=ifqh2/ifqh
gen ifqh3shr=ifqh3/ifqh
gen ifqh4shr=ifqh4/ifqh
gen ifqh34shr=(ifqh3+ifqh4)/ifqh
gen ifqh23shr=(ifqh2+ifqh3)/ifqh

gen lnifqh2shr=ln(ifqh2shr)
gen lnifqh3shr=ln(ifqh3shr)
gen lnifqh4shr=ln(ifqh4shr)
gen lnifqh23shr=ln(ifqh23shr)
gen lnifqh34shr=ln((ifqh3+ifqh4)/ifqh)

gen lnifqh2shr2=lnifqh2shr*lnifqh2shr
gen lnifqh3shr2=lnifqh3shr*lnifqh3shr
gen lnifqh4shr2=lnifqh4shr*lnifqh4shr
gen lnifqh23shr2=lnifqh23shr*lnifqh23shr

*proxy for inputs sharing: Export intensity
gen lnexpint=ln(export_tot/vsm)

*proxying R&D intensity: R&D expenditure / total output
gen lnrd=ln(rd/vsm)
gen lnrd1=ln(rd)
gen lnrds=ln(rds/vsm)

gen rdlshr=rdl/vsml
gen lnrdl=ln(rdl*1000/vsml)
gen lnrdls=ln(rdls/vsml)

*generate proxy for natural advantage
gen lnpee =ln(pee/pvv)
*gen lnpee =ln(pee_gn)/ln(pvv_gn)
gen peeshr=pee/pvv

*input and output distance

gen lnl_idist_n10=ln(l_idist_n10)
gen lnl_odist_n10=ln(l_odist_n10)
gen lnl_idist_n7=ln(l_idist_n7)
gen lnl_odist_n7=ln(l_odist_n7)
gen lnl_idist_n5=ln(l_idist_n5)
gen lnl_odist_n5=ln(l_odist_n5)
gen lnl_idist_n3=ln(l_idist_n3)
gen lnl_odist_n3=ln(l_odist_n3)

gen lndistn2=ln(distn2)
gen lndistn3=ln(distn3)
gen lndistn5=ln(distn5)
gen lndistn7=ln(distn7)
gen lndistn10=ln(distn10)

//Input and output distance imputed

gen lnl_idist_n10i=ln(l_idist_n10i)
gen lnl_odist_n10i=ln(l_odist_n10i)
gen lnl_idist_n7i=ln(l_idist_n7i)
gen lnl_odist_n7i=ln(l_odist_n7i)
gen lnl_idist_n5i=ln(l_idist_n5i)
gen lnl_odist_n5i=ln(l_odist_n5i)
gen lnl_idist_n3i=ln(l_idist_n3i)
gen lnl_odist_n3i=ln(l_odist_n3i)

gen lndistn2i=ln(distn2i)
gen lndistn3i=ln(distn3i)
gen lndistn5i=ln(distn5i)
gen lndistn7i=ln(distn7i)
gen lndistn10i=ln(distn10i)



//Aggregate industry classifications

**2- and 3-digit NAICS
gen naics2=floor(naics/10000)
gen naics3=floor(naics/1000)

**textiles dumies
gen textile=0
replace textile=1 if naics==313110 | naics==313110 | naics==313210 | naics== 313220 | naics==313230 | naics==313240 | naics==313310 | naics==313320 | naics==314110 | naics==314120 | naics==314910 | naics==314990 | naics==315110 | naics==315190 | naics==315210 | naics==315221 | naics==315222 | naics==315226 | naics==315227 | naics==315229 | naics==315231 | naics==315232 | naics==315233 | naics==315234 | naics==315239 | naics==315291 | naics==315292 | naics==315299 | naics==315990

**high tech dummies
gen level1=0
replace level1=1 if naics==325410 | naics==334110 | naics==334511 | naics==334512 | naics==336410 | naics==334210 | naics==334220 | naics==334290 | naics==334410 
gen level2=0
replace level2=1 if naics==333310 | naics==334610 | naics==325110 | naics==333210 | naics==333220 | naics==333291 | naics==333299 | naics==334310 | naics==325210 | naics==325220
gen level3=0
replace level3=1 if naics==335311 | naics==335312 | naics==335315 | naics==325910 | naics==325920 | naics==325991 | naics==325999 | naics==333910 | naics==333920 | naics==333990 | naics==333611 | naics==333619 | naics==325510 | naics==325520 | naics==324110 | naics==324121 | naics==324122 | naics==324190 | naics==325313 | naics==325314 | naics==325320 | naics==336990
gen level4=0
replace level4=1 if naics==332910 | naics==332991 | naics==332999 | naics==332991 | naics==335910 | naics==335920 | naics==335930 | naics==335990 | naics==336110 | naics==336120 | naics==333511 | naics==333519 | naics==332410 | naics==332420 | naics==332431 | naics==332439 | naics==336211 | naics==336212 | naics==336215
gen high_comp=level1+level2 +level3+level4
gen hightech=0
replace hightech=1 if high_comp>0
drop high_comp


//The full model is run to set a consistent sample for the models with variable subsets

*xi: xtreg  ln_cdf50 ln_empl lnherfent lnm_emp nmulti1 nfown lnnrs lnpee m_asiashr m_oecdshr m_naftashr x_asiashr x_oecdshr x_naftashr lnav_klems lnl_idist_n5 lnl_odist_n5  lnnprdprd lnnprdprd2 lnrdl i.year, fe cluster(naics)
xi: xtreg  ln_cdf50 lnav_klems ln_empl lnherfent lnm_emp nmulti1 nfown lnnrs lnpee lnifqh3shr lnrdl m_asiashr m_oecdshr m_naftashr x_asiashr x_oecdshr x_naftashr lnl_idist_n5 lnl_odist_n5 lndistn5 i.year, fe cluster(naics)
keep if e(sample)

//Generate the residual of transport variable

xtreg lnav_klems lnmfpa i.year, fe  cluster(naics)
predict double xb
gen lnav_klems_resid=lnav_klems-xb


save "\\ead02\ead_uquam\Localization\NAICS6_panel\restat_final.dta", replace

*end

*********************************************************************************END DATA CONSTRUCTION AND TESTING*********************************************************************************
